
package com.thundern.tdcrawler.selector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import javax.swing.text.StyledEditorKit.BoldAction;

/**
 * Selector that pulls text out of an HTML page using line-block density.
 *
 * <p>The markup is stripped first, the remaining text is split on {@code '\n'},
 * and whitespace is removed from every line. For each line index a "window"
 * value is computed: the total length of that line and the following
 * {@code BLOCKS_WIDTH - 1} lines. A block opens at a window whose value exceeds
 * {@code THRESHOLD} (provided one of the next few windows is non-empty) and
 * closes at the first window that is empty or is followed by an empty window.
 * The lines of each closed block are appended to the result, one per line.
 */
public class SmartContentSelector implements Selector {

	/** Number of consecutive lines summed into one window. */
	private static final int BLOCKS_WIDTH = 3;

	/** A window must hold more than this many characters to open a block. */
	private static final int THRESHOLD = 86;

	/** Number of following windows inspected before a block is opened. */
	private static final int START_LOOKAHEAD = 3;

	/** Lines shorter than this (after whitespace removal) are left out of the output. */
	private static final int MIN_LINE_LENGTH = 5;

	// Compiled once; String.replaceAll would recompile each expression on every call.
	private static final Pattern DOCTYPE = Pattern.compile("(?is)<!DOCTYPE.*?>");
	private static final Pattern COMMENT = Pattern.compile("(?is)<!--.*?-->");
	private static final Pattern SCRIPT = Pattern.compile("(?is)<script.*?>.*?</script>");
	private static final Pattern STYLE = Pattern.compile("(?is)<style.*?>.*?</style>");
	private static final Pattern ENTITY = Pattern.compile("&.{2,5};|&#.{2,5};");
	private static final Pattern TAG = Pattern.compile("(?is)<.*?>");
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	public SmartContentSelector() {}

	/**
	 * Extracts the dense text blocks of an HTML document.
	 *
	 * @param html the raw HTML source
	 * @return the selected lines, each terminated by {@code '\n'}; an empty
	 *         string when no block is found (which is always the case when the
	 *         stripped text has {@code BLOCKS_WIDTH + 1} lines or fewer)
	 * @throws NullPointerException if {@code html} is {@code null}
	 */
	public String select(String html) {
		List<String> lines = compactLines(stripMarkup(html));
		int[] density = windowDensities(lines);

		StringBuilder text = new StringBuilder();
		boolean inBlock = false;
		int start = -1;

		// The last window is only ever used as the "next" window of its predecessor.
		for (int i = 0; i < density.length - 1; i++) {
			if (!inBlock) {
				if (density[i] > THRESHOLD && hasContentAhead(density, i)) {
					inBlock = true;
					start = i;
				}
				continue;
			}
			if (density[i] == 0 || density[i + 1] == 0) {
				String block = joinLines(lines, start, i);
				// A block mentioning "Copyright" is discarded. The block is closed
				// either way; leaving it open would make every later iteration
				// rebuild the same span and swallow all following content.
				if (!block.contains("Copyright")) {
					text.append(block);
				}
				inBlock = false;
			}
		}
		// NOTE(review): a block that is still open when the loop ends is never
		// appended, so text running to the very end of the input is dropped —
		// confirm whether that is intended.
		return text.toString();
	}

	public List<String> selectList(String text) {
		throw new UnsupportedOperationException();
	}

	/** Removes doctype, comments, scripts, styles and tags; entities become a space. */
	private static String stripMarkup(String html) {
		String text = DOCTYPE.matcher(html).replaceAll("");
		text = COMMENT.matcher(text).replaceAll("");
		text = SCRIPT.matcher(text).replaceAll("");
		text = STYLE.matcher(text).replaceAll("");
		text = ENTITY.matcher(text).replaceAll(" ");
		return TAG.matcher(text).replaceAll("");
	}

	/** Splits the text on '\n' and removes all whitespace from every line. */
	private static List<String> compactLines(String text) {
		String[] raw = text.split("\n");
		List<String> lines = new ArrayList<String>(raw.length);
		for (String line : raw) {
			lines.add(WHITESPACE.matcher(line).replaceAll(""));
		}
		return lines;
	}

	/** Returns, for each window start, the total length of BLOCKS_WIDTH consecutive lines. */
	private static int[] windowDensities(List<String> lines) {
		int count = Math.max(0, lines.size() - BLOCKS_WIDTH);
		int[] density = new int[count];
		for (int i = 0; i < count; i++) {
			int chars = 0;
			for (int j = i; j < i + BLOCKS_WIDTH; j++) {
				chars += lines.get(j).length();
			}
			density[i] = chars;
		}
		return density;
	}

	/**
	 * Tells whether any of the START_LOOKAHEAD windows after {@code index} is
	 * non-empty. Windows past the end of the array count as empty, so the
	 * lookahead cannot run out of bounds near the end of the document.
	 */
	private static boolean hasContentAhead(int[] density, int index) {
		int last = Math.min(index + START_LOOKAHEAD, density.length - 1);
		for (int k = index + 1; k <= last; k++) {
			if (density[k] != 0) {
				return true;
			}
		}
		return false;
	}

	/** Concatenates lines[from..to] (inclusive), skipping lines shorter than MIN_LINE_LENGTH. */
	private static String joinLines(List<String> lines, int from, int to) {
		StringBuilder block = new StringBuilder();
		for (int k = from; k <= to; k++) {
			String line = lines.get(k);
			if (line.length() >= MIN_LINE_LENGTH) {
				block.append(line).append('\n');
			}
		}
		return block.toString();
	}
}
